#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : config_spider
import scrapy
import json
import math
import time
import urllib.parse

from apps.tax_policy.tax_policy.items import NetTaxPolicyItem as Item
from apps.tax_policy.tax_policy.spiders.base_spider.base_tax_policy_spider import BaseTaxPolicySpider


class XinjiangHetianPhpPolicy(BaseTaxPolicySpider):
    """Spider for policy articles on the Hetian prefecture / Hetian city government sites.

    Starts from a fixed set of ``list.php?catid=...`` pages, follows every article
    link found on them (including the remaining pages of each list) and yields one
    item per article detail page.
    """

    name = 'xinjiang_hetian_php_policy'

    province = '新疆维吾尔自治区'
    city: str = "和田地区"  # taken from the table
    # ``county`` is intentionally not a class constant: it differs per start URL
    # and is carried to the detail page through ``meta['item']``.
    park: str = ""  # taken from the table

    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh,en;q=0.9,zh-CN;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
            "sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
    }

    @classmethod
    def update_settings(cls, settings) -> None:
        """Apply ``custom_settings`` and register ``JslMiddleware`` at spider priority.

        The downloader middlewares already configured in ``settings`` are spread
        after the ``JslMiddleware`` entry, so an existing entry for the same key
        keeps its configured order instead of the default 543 used here.

        NOTE(review): ``JslMiddleware`` presumably handles the sites' anti-crawler
        challenge -- confirm against the middleware implementation.

        Args:
            settings: The Scrapy settings object to update in place.
        """
        downloader_middlewares = settings.getdict("DOWNLOADER_MIDDLEWARES")
        new_settings = {
            **(cls.custom_settings or {}),
            "DOWNLOADER_MIDDLEWARES": {
                "components.middlewares.downloadmiddlewares.public.jsl_middlewares.JslMiddleware": 543,
                **downloader_middlewares,
            },
        }
        settings.setdict(new_settings, priority="spider")

    def start_requests(self):
        """Yield one request per configured list page.

        Each request carries ``{'source', 'county'}`` in ``meta['item']`` so the
        values can be copied onto every item scraped from that list.
        """
        for source, county, url in [
            ['新疆维吾尔自治区和田地区行政公署', "", "https://www.xjht.gov.cn/article/list.php?catid=244"],
            ['新疆维吾尔自治区和田地区行政公署', "", "https://www.xjht.gov.cn/article/list.php?catid=149"],
            ['和田市人民政府', "和田市", "https://www.hts.gov.cn/xinxigongkai/list.php?catid=178"],
            ['和田市人民政府', "和田市", "https://www.hts.gov.cn/xinxigongkai/list.php?catid=185"],
        ]:
            item = {'source': source, 'county': county}
            yield scrapy.Request(url, callback=self.parse_list, meta={'item': item})

    def parse_list(self, response, **kwargs):
        """Parse a list page: follow each article link and schedule the other pages.

        Pagination is expanded only from the first page of a list; the follow-up
        page requests are tagged with ``meta['is_next'] = False`` so they do not
        schedule the same pages again.

        Args:
            response: The list page response; ``response.meta['item']`` may hold
                the per-list ``source`` / ``county`` values.

        Yields:
            Requests for article detail pages and for list pages 2..N.
        """
        prev_item = response.meta.get('item')
        elem_list = response.xpath(
            "//div[@class='info']/a[contains(@href, 'php') or @title]/ancestor::div[string-length(string(.))>=5] | "
            "//div[@class='ulist']//dd//a[contains(@href, 'php') or @title]/ancestor::dd[string-length(string(.))>=5]"
        )
        for elem in elem_list:
            href = elem.xpath("""./a/@href""").re_first(r"""(.*)""")
            # The emptiness check must use the raw href: urljoin() of a missing
            # href returns the page's own URL, which is always truthy, so
            # checking after the join would never skip link-less containers.
            if not href:
                continue
            item = Item()
            item['source_url'] = response.urljoin(href)
            # Provisional value (full text of the entry); parse_detail overwrites it.
            item['publish_date'] = elem.xpath("""string(.)""").get()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            self.logger.debug("%s -> %s", response.url, item['source_url'])
            yield response.follow(item['source_url'], callback=self.parse_detail, meta={'item': item})

        if response.meta.get("is_next") is False:
            return

        # The pager text looks like "...条/N页"; N is the total number of pages.
        pages = response.xpath(""".""").re_first(r'条/(\d+)页')
        self.logger.debug("%s page: %s", response.url, pages)
        if pages is None:
            # No pager on the page: nothing further to request.
            return
        # NOTE(review): ``headers`` is not defined in this class; presumably the
        # base spider provides it. Fall back to None (Scrapy defaults) if absent.
        headers = getattr(self, 'headers', None)
        for page_num in range(2, int(pages) + 1):
            url = response.url + f'&page={page_num}'
            yield response.follow(url, callback=self.parse_list, meta={'item': prev_item, 'is_next': False}, headers=headers)

    def parse_detail(self, response, **kwargs):
        """Fill in the article fields from a detail page and yield the item.

        The title comes from the ``ArticleTitle`` meta tag; the publish date comes
        from the ``PubDate`` meta tag, falling back to the text of the element
        with class ``fbrq``. ``content`` is the serialized page itself.

        Args:
            response: The detail page response; ``response.meta['item']`` is the
                item started in ``parse_list`` (a fresh one is created if absent).

        Yields:
            The completed item.
        """
        item = response.meta.get('item')
        if item is None:
            item = Item()
        item['title'] = response.xpath("""string(//meta[@name='ArticleTitle']/@content)""").get()
        item['publish_date'] = (
            response.xpath("""string(//meta[@name='PubDate']/@content)""").get()
            or response.xpath("//*[@class='fbrq']/text()").get()
        )
        item['content'] = response.xpath(""".""").get()
        # ``source`` and ``county`` are not set here: they arrive with the item
        # passed through meta from the list page.
        item['province'] = self.province
        item['city'] = self.city
        item['park'] = self.park
        yield item


if __name__ == "__main__":
    # Convenience entry point: running this file directly launches the spider
    # through Scrapy's command-line machinery.
    from scrapy.cmdline import execute

    crawl_argv = "scrapy crawl xinjiang_hetian_php_policy".split()
    execute(crawl_argv)
